library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggrepel)
We have learned the fundamental functionalities of ggplot. Let’s refine the plots.
setwd("~/Documents/ibs_course/BUS240/data")
load("asasec.rda")
head(asasec)
## Section Sname Beginning Revenues
## 1 Aging and the Life Course (018) Aging 12752 12104
## 2 Alcohol, Drugs and Tobacco (030) Alcohol/Drugs 11933 1144
## 3 Altruism and Social Solidarity (047) Altruism 1139 1862
## 4 Animals and Society (042) Animals 473 820
## 5 Asia/Asian America (024) Asia 9056 2116
## 6 Body and Embodiment (048) Body 3408 1618
## Expenses Ending Journal Year Members
## 1 12007 12849 No 2005 598
## 2 400 12677 No 2005 301
## 3 1875 1126 No 2005 NA
## 4 1116 177 No 2005 209
## 5 1710 9462 No 2005 365
## 6 1920 3106 No 2005 NA
This is some data on membership over time in special-interest sections of the American Sociological Association.
In this dataset, we have membership data for each section over a ten year period.
Let’s look at the relationship between section membership and section revenues for a single year, 2014.
p <- ggplot(data = subset(asasec, Year == 2014),
mapping = aes(x = Members, y = Revenues, label = Sname))
p + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
This is our basic scatterplot-and-smoother graph. The defalut geom_smooth method is loess (local weighted regression).
To refine it, let’s begin by identifying some outliers, switch from loess to OLS, and introduce a third variable, Journal, into the aes() which is inside geom_point
unique(asasec$Journal)
## [1] No Yes
## Levels: No Yes
p <- ggplot(data = subset(asasec, Year == 2014),
mapping = aes(x = Members, y = Revenues, label = Sname))
p + geom_point(mapping = aes(color = Journal)) +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
Now we can add some text labels. Let’s build it additively.
p0 <- ggplot(data = subset(asasec, Year == 2014),
mapping = aes(x = Members, y = Revenues, label = Sname))
p1 <- p0 + geom_smooth(method = "lm", se = FALSE, color = "gray80") +
geom_point(mapping = aes(color = Journal))
p1
## `geom_smooth()` using formula 'y ~ x'
p2 <- p1 + geom_text_repel(data=subset(asasec,
Year == 2014 & Revenues > 7000),
size = 2)
p2
## `geom_smooth()` using formula 'y ~ x'
Continuing with the p2 object still, we can label the axes and scales.
We also add a title and move the legend to make better use of the space
in the plot.
p3 <- p2 + labs(x="Membership",
y="Revenues",
color = "Section has own Journal",
title = "ASA Sections",
subtitle = "2014 Calendar year.",
caption = "Source: ASA annual report.")
p3
## `geom_smooth()` using formula 'y ~ x'
p4 <- p3 + scale_y_continuous(labels = scales::dollar) +
theme(legend.position = "bottom")
p4
## `geom_smooth()` using formula 'y ~ x'
Consider: ordered vs. unordered categorical variable. What’s the examples?
There are many ways to color your figures, so make sure that the following example is just one of them. We use the RColorBrewer package to make a wide range of named color palettes available. Together with ggplot, you access these colors by specifying the scale_color_brewer() or scale_fill_brewer() functions, depending on the aesthetic you are mapping.
See more reference: https://ggplot2.tidyverse.org/reference/scale_brewer.html
Let’s use organdata we used before.
setwd("~/Documents/ibs_course/BUS240/data")
load("organdata.rda")
head(organdata)
## # A tibble: 6 × 21
## country year donors pop pop_d…¹ gdp gdp_lag health healt…² pubhe…³
## <chr> <date> <dbl> <int> <dbl> <int> <int> <dbl> <dbl> <dbl>
## 1 Australia NA NA 17065 0.220 16774 16591 1300 1224 4.8
## 2 Australia 1991-01-01 12.1 17284 0.223 17171 16774 1379 1300 5.4
## 3 Australia 1992-01-01 12.4 17495 0.226 17914 17171 1455 1379 5.4
## 4 Australia 1993-01-01 12.5 17667 0.228 18883 17914 1540 1455 5.4
## 5 Australia 1994-01-01 10.2 17855 0.231 19849 18883 1626 1540 5.4
## 6 Australia 1995-01-01 10.2 18072 0.233 21079 19849 1737 1626 5.5
## # … with 11 more variables: roads <dbl>, cerebvas <int>, assault <int>,
## # external <int>, txp_pop <dbl>, world <chr>, opt <chr>, consent_law <chr>,
## # consent_practice <chr>, consistent <chr>, ccode <chr>, and abbreviated
## # variable names ¹pop_dens, ²health_lag, ³pubhealth
p <- ggplot(data = organdata,
mapping = aes(x = roads, y = donors, color = world))
p + geom_point(size = 2)
## Warning: Removed 34 rows containing missing values (geom_point).
p + geom_point(size = 2) + scale_color_brewer(palette = "Set2")
## Warning: Removed 46 rows containing missing values (geom_point).
p + geom_point(size = 2) + scale_color_brewer(palette = "Set2") +
theme(legend.position = "top")
## Warning: Removed 46 rows containing missing values (geom_point).
p + geom_point(size = 2) + scale_color_brewer(palette = "Pastel2") +
theme(legend.position = "top")
## Warning: Removed 46 rows containing missing values (geom_point).
p + geom_point(size = 2) + scale_color_brewer(palette = "Dark2") +
theme(legend.position = "top")
## Warning: Removed 46 rows containing missing values (geom_point).
You can also specify colors manually, via scale_color_manual() or scale_fill_manual(). These functions take a value argument that can be specified as vector of color names or color values that R knows about.
R knows many color names (like red, and green, and cornflowerblue. Try demo(‘colors’) for an overview.
Alternatively, color values can be specified via their hexadecimal RGB value. This is a way of encoding color values in the RGB colorspace, where each channel can take a value from 0 to 255 like this. A color hex value begins with a hash or pound character, #, followed by three pairs of hexadecimal or “hex” numbers. Hex values are in Base 16, with the first six letters of the alphabet standing for the numbers 10 to 15. This allows a two-character hex number to range from 0 to 255. You read them as #rrggbb, where rr is the two-digit hex code for the red channel, gg for the green channel, and bb for the blue channel. So #CC55DD translates in decimal to CC = 204 (red), 55 = 85 (green), and DD = 221 (blue). It gives a strong pink color.
Going back to our ASA Membership plot, for example, we can manually introduce a palette that’s friendly to color-blind viewers.
cb_palette <- c("#999999", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7")
p4 + scale_color_manual(values = cb_palette)
## `geom_smooth()` using formula 'y ~ x'
The ability to manually specify colors can be useful when the meaning of a category itself has a strong color association. For example, blue and red for political parties in U.S. What else?
Let’s work through an example where we use manually specified colors both for emphasis and because of their social meaning. We will build up a plot of data about the 2016 US general election used in the last lecture.
setwd("~/Documents/ibs_course/BUS240/data")
load("county_data.rda")
party_colors <- c("#2E74C0", "#CB454A") # Democrat Blue and Republican Red
Note this variables:
# id. FIPS State and County code (character)
# pop. Population, 2014 estimate
# flipped Did the area flip parties from 2012 to 2016
# black. Black alone, percent, 2013
county_data %>% select(id, pop, flipped, black) %>% sample_n(8)
## id pop flipped black
## 1 31037 10504 No 2.6
## 2 28117 25428 No 14.0
## 3 31051 5782 No 0.6
## 4 55109 86759 No 0.7
## 5 06029 874589 No 6.3
## 6 48417 3343 No 1.3
## 7 48423 218842 No 17.9
## 8 08011 5630 No 6.8
p0 <- ggplot(data = subset(county_data,
flipped == "No"),
mapping = aes(x = pop,
y = black/100))
p0 + geom_point(color = "gray50")
p0 + geom_point(alpha = 0.15, color = "gray50")
p0 + geom_point(alpha = 0.15, color = "gray50") + scale_x_log10(labels=scales::comma)
Now consider
#partywinner16. Winning party, 2016 Presidental Election.
county_data %>% select(id, partywinner16) %>% sample_n(8)
## id partywinner16
## 1 26085 Republican
## 2 48315 Republican
## 3 37187 Democrat
## 4 13205 Republican
## 5 30081 Republican
## 6 32029 Republican
## 7 13295 Republican
## 8 48119 Republican
# save previous results in p1
p1 <- p0 + geom_point(alpha = 0.15, color = "gray50") + scale_x_log10(labels=scales::comma)
p2 <- p1 + geom_point(data = subset(county_data,
flipped == "Yes"),
mapping = aes(x = pop, y = black/100,
color = partywinner16)) +
scale_color_manual(values = party_colors)
p2
Clean up the labels
p3 <- p2 + scale_y_continuous(labels=scales::percent) +
labs(color = "County flipped to ... ",
x = "County Population (log scale)",
y = "Percent Black Population",
title = "Flipped counties, 2016",
caption = "Counties in gray did not flip.")
p3
Last, add a third layer using geom_text_repel() function. Once again we supply a set of instructions to subset the data for this text layer. We are interested in the flipped counties that have with a relatively high percentage of African-American residents.
p4 <- p3 + geom_text_repel(data = subset(county_data,
flipped == "Yes" &
black > 25),
mapping = aes(x = pop,
y = black/100,
label = state), size = 2)
p4
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p4 + theme_minimal() + theme(legend.position="top")
Consider how much information we use from data to picture this.
Need more style?
theme_set(theme_bw())
p4 + theme(legend.position="top")
theme_set(theme_dark())
p4 + theme(legend.position="top")
Professional touch (not artistic)? Use ggthemes.
library(ggthemes)
theme_set(theme_economist())
p4 + theme(legend.position="top")
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
theme_set(theme_wsj())
p4 + theme(plot.title = element_text(size = rel(0.6)),
legend.title = element_text(size = rel(0.35)),
plot.caption = element_text(size = rel(0.35)),
legend.position = "top")
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
##
## theme_map
theme_set(theme_minimal())
setwd("~/Documents/ibs_course/BUS240/data")
load("fredts.rda")
head(fredts)
## date sp500 monbase sp500_i monbase_i
## 1 2009-03-11 696.68 1542228 100.0000 100.0000
## 2 2009-03-18 766.73 1693133 110.0548 109.7849
## 3 2009-03-25 799.10 1693133 114.7012 109.7849
## 4 2009-04-01 809.06 1733017 116.1308 112.3710
## 5 2009-04-08 830.61 1733017 119.2240 112.3710
## 6 2009-04-15 852.21 1789878 122.3245 116.0579
fredts_m <- fredts %>% select(date, sp500_i, monbase_i) %>%
gather(key = series, value = score, sp500_i:monbase_i)
head(fredts_m)
## date series score
## 1 2009-03-11 sp500_i 100.0000
## 2 2009-03-18 sp500_i 110.0548
## 3 2009-03-25 sp500_i 114.7012
## 4 2009-04-01 sp500_i 116.1308
## 5 2009-04-08 sp500_i 119.2240
## 6 2009-04-15 sp500_i 122.3245
p <- ggplot(data = fredts_m,
mapping = aes(x = date, y = score,
group = series,
color = series))
p1 <- p + geom_line() + theme(legend.position = "top") +
labs(x = "Date",
y = "Index",
color = "Series")
p1
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
p <- ggplot(data = fredts,
mapping = aes(x = date, y = sp500_i - monbase_i))
p2 <- p + geom_line() +
labs(x = "Date",
y = "Difference")
p2
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
cowplot::plot_grid(p1, p2, nrow = 2, rel_heights = c(0.75, 0.25), align = "v")
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.